{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import twitter\n",
    "consumer_key = \"<Your Consumer Key Here>\"\n",
    "consumer_secret = \"<Your Consumer Secret Here>\"\n",
    "access_token = \"<Your Access Token Here>\"\n",
    "access_token_secret = \"<Your Access Token Secret Here>\"\n",
    "authorization = twitter.OAuth(access_token, \n",
    "access_token_secret, consumer_key, consumer_secret)\n",
    "t = twitter.Twitter(auth=authorization, retry=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os \n",
    "data_folder = os.path.join(\"twitter\")\n",
    "output_filename = os.path.join(data_folder, \"python_tweets.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "original_users = [] \n",
    "tweets = []\n",
    "user_ids = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "search_results = t.search.tweets(q=\"python\", count=100)['statuses']\n",
    "for tweet in search_results:\n",
    "    if 'text' in tweet:\n",
    "        original_users.append(tweet['user']['screen_name']) \n",
    "        user_ids[tweet['user']['screen_name']] = tweet['user']['id']\n",
    "        tweets.append(tweet['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_filename = os.path.join(\"models\", \"python_context.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import spacy\n",
    "from sklearn.base import TransformerMixin\n",
    "\n",
    "# Create a spaCy parser\n",
    "nlp = spacy.load('en')\n",
    "\n",
    "\n",
    "class BagOfWords(TransformerMixin):\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    " \n",
    "    def transform(self, X):\n",
    "        results = []\n",
    "        for document in X:\n",
    "            row = {}\n",
    "            for word in list(nlp(document, tag=False, parse=False, entity=False)):\n",
    "                if len(word.text.strip()): # Ignore words that are just whitespace\n",
    "                    row[word.text] = True\n",
    "                    results.append(row)\n",
    "        return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.externals import joblib\n",
    "context_classifier = joblib.load(model_filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_pred = context_classifier.predict(tweets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "relevant_tweets = [tweets[i] for i in range(len(tweets)) if y_pred[i] == 1]\n",
    "relevant_users = [original_users[i] for i in range(len(tweets)) if y_pred[i] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "def get_friends(t, user_id):\n",
    "    friends = []\n",
    "    cursor = -1\n",
    "    while cursor != 0: \n",
    "        try:\n",
    "            results = t.friends.ids(user_id= user_id, cursor=cursor, count=5000)\n",
    "            friends.extend([friend for friend in results['ids']])\n",
    "            cursor = results['next_cursor'] \n",
    "            if True or len(friends) >= 10000:\n",
    "                break\n",
    "        except TypeError as e:\n",
    "            if results is None:\n",
    "                print(\"You probably reached your API limit, waiting for 5 minutes\")\n",
    "                sys.stdout.flush() \n",
    "                time.sleep(5*60) # 5 minute wait \n",
    "            else: \n",
    "                # Some other error happened, so raise the error as normal\n",
    "                raise e\n",
    "        except twitter.TwitterHTTPError as e:\n",
    "            print(e)\n",
    "            break\n",
    "        finally:\n",
    "            # Break regardless -- this stops us going over our API limit\n",
    "            time.sleep(60)\n",
    "    return friends"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "friends = {} \n",
    "for screen_name in relevant_users:\n",
    "    user_id = user_ids[screen_name]\n",
    "    friends[user_id] = get_friends(t, user_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "friends = {user_id:friends[user_id] \n",
    "           for user_id in friends\n",
    "           if len(friends[user_id]) > 0}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "def count_friends(friends): \n",
    "    friend_count = defaultdict(int)\n",
    "    for friend_list in friends.values(): \n",
    "        for friend in friend_list:\n",
    "            friend_count[friend] += 1 \n",
    "    return friend_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "friend_count = count_friends(friends)\n",
    "from operator import itemgetter\n",
    "best_friends = sorted(friend_count, key=friend_count.get, reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "best_friends"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "while len(friends) < 150:\n",
    "    for user_id in best_friends:\n",
    "        if user_id in friends:\n",
    "            continue\n",
    "        print(user_id)\n",
    "        sys.stdout.flush()\n",
    "        friends[user_id] = get_friends(t, user_id) \n",
    "        for friend in friends[user_id]: \n",
    "            friend_count[friend] += 1\n",
    "        best_friends = sorted(friend_count.items(), key=itemgetter(1), reverse=True)\n",
    "        break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "len(friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "friends_filename = os.path.join(data_folder, \"python_friends.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "with open(friends_filename, 'w') as outf: \n",
    "    json.dump(friends, outf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open(friends_filename) as inf:\n",
    "    friends = json.load(inf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "len(friends), type(friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import networkx as nx \n",
    "G = nx.DiGraph()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "main_users = friends.keys() \n",
    "G.add_nodes_from(main_users)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for user_id in friends:\n",
    "    for friend in friends[user_id]:\n",
    "        if str(friend) in main_users: \n",
    "            G.add_edge(user_id, friend) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nx.draw?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline \n",
    "nx.draw(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.figure(3,figsize=(20,20))\n",
    "nx.draw(G, alpha=0.1, edge_color='b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "friends = {user: set(friends[user]) for user in friends}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def compute_similarity(friends1, friends2):\n",
    "    return len(friends1 & friends2) / (len(friends1 | friends2) + 1e-6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_graph(followers, threshold=0): \n",
    "    G = nx.Graph()\n",
    "    for user1 in friends.keys(): \n",
    "        for user2 in friends.keys(): \n",
    "            if user1 == user2:\n",
    "                continue\n",
    "            weight = compute_similarity(friends[user1], friends[user2])\n",
    "            if weight >= threshold:\n",
    "                G.add_node(user1) \n",
    "                G.add_node(user2)\n",
    "                G.add_edge(user1, user2, weight=weight)\n",
    "    return G"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "pos = nx.spring_layout(G)\n",
    "nx.draw_networkx_nodes(G, pos)\n",
    "edgewidth = [ d['weight'] for (u,v,d) in G.edges(data=True)]\n",
    "nx.draw_networkx_edges(G, pos, width=edgewidth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    n_nodes = len(sub_graph.nodes()) \n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "G = create_graph(friends, 0.25) \n",
    "sub_graphs = nx.connected_component_subgraphs(G) \n",
    "for i, sub_graph in enumerate(sub_graphs): \n",
    "    n_nodes = len(sub_graph.nodes()) \n",
    "    print(\"Subgraph {0} has {1} nodes\".format(i, n_nodes))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G) \n",
    "n_subgraphs = nx.number_connected_components(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "n_subgraphs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sub_graphs = nx.connected_component_subgraphs(G) \n",
    "n_subgraphs = nx.number_connected_components(G)\n",
    "\n",
    "fig = plt.figure(figsize=(20, (n_subgraphs * 3)))\n",
    "for i, sub_graph in enumerate(sub_graphs):\n",
    "    \n",
    "    ax = fig.add_subplot(int(n_subgraphs / 3)+1, 3, i+1)\n",
    "    ax.get_xaxis().set_visible(False) \n",
    "    ax.get_yaxis().set_visible(False)\n",
    "    nx.draw(sub_graph, ax=ax)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "print(\"Done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "def compute_silhouette(threshold, friends):\n",
    "    G = create_graph(friends, threshold=threshold) \n",
    "    if len(G.nodes()) < 2:\n",
    "        return -99\n",
    "    sub_graphs = nx.connected_component_subgraphs(G)\n",
    "\n",
    "    if not (2 <= nx.number_connected_components(G) < len(G.nodes()) - 1): \n",
    "        return -99\n",
    "\n",
    "    label_dict = {}\n",
    "    for i, sub_graph in enumerate(sub_graphs): \n",
    "        for node in sub_graph.nodes(): \n",
    "            label_dict[node] = i\n",
    "\n",
    "    labels = np.array([label_dict[node] for node in G.nodes()])\n",
    "    X = nx.to_scipy_sparse_matrix(G).todense()\n",
    "    X = 1 - X\n",
    "    return silhouette_score(X, labels, metric='precomputed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def inverted_silhouette(threshold, friends):\n",
    "    return -compute_silhouette(threshold, friends)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from scipy.optimize import minimize\n",
    "result = minimize(inverted_silhouette, 0.1, args=(friends,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
